Analisis de los datos de Strava¶

In [3]:
%%capture
# Install necessary packages
!pip install fitdecode
!pip install Path
!pip install zipfile36
!pip install dateparser
In [4]:
# Let plotly know it runs inside a Jupyter Notebook
import plotly.io as pio
pio.renderers.default = 'notebook'

Importing the necesary data

In [7]:
# Importing packages
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import dateparser

# Positions of relevant columns
usecols = [0, 1, 2, 3, 4, 5, 7, 8, 12, 16, 17, 18, 20, 31]

# English column names
names = [
    "Activity ID",
    "Activity Date",
    "Activity Name",
    "Activity Type",
    "Activity Description",
    "Elapsed Time",
    "Max Heart Rate",
    "Relative Effort",
    "Filename",
    "Moving Time",
    "Distance",
    "Max Pace",
    "Elevation Gain",
    "Average Heart Rate",
]

# Reading the raw data for reference
raw = pd.read_csv("activities.csv")

# Reading the raw data with preprocessing
df = pd.read_csv(
    "activities.csv",
    index_col=0,
    parse_dates=[1],
    usecols=usecols,
    names=names,
    header=0,
    date_parser=dateparser.parse,
)

print(f"{raw.shape[0]} rows in raw file")

# Drop columns with missing values in Moving Time and Distance
df = df.dropna(axis=0, subset=["Moving Time", "Distance"])

print(f"{df.shape[0]} rows remaining after cleaning")

# Creating new columns:
# Add day, week, month, quarter, year columns
names = ["Day", "Week", "Month", "Quarter", "Year"]
periods = ["D", "W", "M", "Q", "y"]
for n, p in zip(names, periods):
    df.insert(3, n, df["Activity Date"].dt.to_period(p).astype(str))
# Convert moving time from seconds to hours
df.insert(13, "Moving Time (hr)", df["Moving Time"] / 3600)
# Convert distance from meters to kilometers
df.insert(16, "Distance (km)", df["Distance"] / 1000)
# Calculate average speed
df.insert(17, "Average Speed (km/hr)", df["Distance (km)"] / df["Moving Time (hr)"])

# Calculate maximum speed
df.insert(18, "Max Speed (km/hr)", df["Max Pace"]*3.6)

# Print date bounds of the data
print(f"Ranges from {df.Day.min()} to {df.Day.max()}")

# Preview the data
df.tail()
/tmp/ipykernel_3631/395550362.py:33: FutureWarning:

The argument 'date_parser' is deprecated and will be removed in a future version. Please use 'date_format' instead, or read your data in as 'object' dtype and then call 'to_datetime'.

550 rows in raw file
550 rows remaining after cleaning
Ranges from 2018-09-09 to 2023-08-30
Out[7]:
Activity Date Activity Name Activity Type Year Quarter Month Week Day Activity Description Elapsed Time ... Filename Moving Time (hr) Moving Time Distance Distance (km) Average Speed (km/hr) Max Speed (km/hr) Max Pace Elevation Gain Average Heart Rate
Activity ID
9638976449 2023-08-13 09:34:26 Bicicleta a la hora del almuerzo Bicicleta 2023 2023Q3 2023-08 2023-08-07/2023-08-13 2023-08-13 NaN 9806 ... activities/10334897911.fit.gz 2.180833 7851.0 44452.949219 44.452949 20.383469 55.281095 15.355860 1087.0 NaN
9644330136 2023-08-05 16:55:14 Carrera de noche Carrera 2023 2023Q3 2023-08 2023-07-31/2023-08-06 2023-08-05 NaN 1870 ... activities/10340562580.fit.gz 0.516111 1858.0 6679.859863 6.679860 12.942678 26.819999 7.450000 0.0 185.839615
9683261634 2023-08-20 06:21:37 Bicicleta por la mañana Bicicleta 2023 2023Q3 2023-08 2023-08-14/2023-08-20 2023-08-20 NaN 4957 ... activities/10381618854.fit.gz 1.183333 4260.0 24721.830078 24.721830 20.891687 49.886248 13.857291 535.0 NaN
9750863053 2023-08-21 17:16:47 Bicicleta al anochecer Bicicleta 2023 2023Q3 2023-08 2023-08-21/2023-08-27 2023-08-21 NaN 6049 ... activities/10453731608.fit.gz 1.405556 5060.0 25067.339844 25.067340 17.834471 36.490079 10.136133 62.0 NaN
9750871220 2023-08-30 15:58:50 Bicicleta por la tarde Bicicleta 2023 2023Q3 2023-08 2023-08-28/2023-09-03 2023-08-30 NaN 5646 ... activities/10453739983.fit.gz 1.412778 5086.0 29676.060547 29.676061 21.005470 54.762891 15.211914 667.0 NaN

5 rows × 22 columns

Calculating the cumulative sums

In [8]:
# Define a time unit: "Year", "Quarter", "Month", "Week", or "Day"
time_unit_km = "Month"

# Group by time_unit_km and activity type
df_km = df.groupby(by=[time_unit_km, "Activity Type"], as_index=False).agg(
    count=("Distance (km)", "count"),
    total_distance_km=("Distance (km)", "sum"),
    avg_distance_km=("Distance (km)", "mean"),
)

# For each activity and time period, make sure there exists a row
# This will ensure there is point on the plot for each combination
acts = df_km["Activity Type"].unique()
times = df_km[time_unit_km].unique()

# Create a list to store new rows
new_rows = []

for a in acts:
    temp = df_km.loc[df_km["Activity Type"] == a]
    for t in times:
        if not (temp[time_unit_km] == t).any():
            new_row = {
                time_unit_km: t,
                "Activity Type": a,
                "count": 0,
                "total_distance_km": 0,
                "avg_distance_km": 0,
            }
            new_rows.append(new_row)

# Concatenate the new rows to the DataFrame
df_km = pd.concat([df_km, pd.DataFrame(new_rows)], ignore_index=True)
# Find and exclude activities with <= 1 km total covered (e.g., weight training)
# You can increase or decrease this cutoff based on your data
kms = df_km.groupby(by=["Activity Type"], as_index=False).sum()
kms = kms[kms["total_distance_km"] > 1]

# For each activity and time period, calculate the cumulative sum of kms
csum = df_km.loc[df_km["Activity Type"].isin(kms["Activity Type"])]
x = pd.Series(dtype=float)
csum.sort_values(by=["Activity Type", time_unit_km], inplace=True)
csum["csum_km"] = csum.groupby("Activity Type")["total_distance_km"].cumsum()

csum
Out[8]:
Month Activity Type count total_distance_km avg_distance_km csum_km
0 2018-09 Bicicleta 6 159.075098 26.512516 159.075098
100 2018-10 Bicicleta 0 0.000000 0.000000 159.075098
2 2018-11 Bicicleta 1 25.858400 25.858400 184.933499
4 2018-12 Bicicleta 2 25.319801 12.659900 210.253299
5 2019-01 Bicicleta 1 44.595602 44.595602 254.848901
... ... ... ... ... ... ...
91 2023-04 Carrera 14 126.872651 9.062332 2063.925490
93 2023-05 Carrera 14 147.999292 10.571378 2211.924782
95 2023-06 Carrera 16 137.116259 8.569766 2349.041041
97 2023-07 Carrera 9 91.196648 10.132961 2440.237690
99 2023-08 Carrera 2 11.947680 5.973840 2452.185369

165 rows × 6 columns

Análisis general¶

Gráfica de los kilometros recorridos por tipo de actividad

In [9]:
# For the plot tile
total_km = round(df_km["total_distance_km"].sum())

# Plot a stacked area plot
fig_km = px.area(
    csum,
    x=time_unit_km,
    y="csum_km",
    color="Activity Type",
    title=f"My {total_km} Kilometers on Strava!",  # Set title text
    hover_data={  # Define variables for hover text
        "csum_km": ":.1f",
        "count": ":f",
        "total_distance_km": ":.1f",
        "avg_distance_km": ":.1f",
    },
    labels=dict(  # Define labels for variables
        count="Number of activities",
        avg_distance_km="Average kms per activity",
        total_distance_km="Total kms covered",
        csum_km="Cumulative kms covered",
    ),
    color_discrete_sequence=px.colors.qualitative.Bold,  # Define color swatch
)

# Set max allowed of ticks on x and y axes
fig_km.update_xaxes(nticks=20)
fig_km.update_yaxes(nticks=15)

# Adjust the size and layout
fig_km.update_layout(
    autosize=False,
    width=700,
    height=500,
    template="plotly_white",  # Others options: "plotly", "plotly_dark", "ggplot2", "seaborn", "simple_white"
    title={"y": 0.9, "x": 0.5, "xanchor": "center", "yanchor": "top"},  # Center title
)

fig_km.show()
  • Las salidas en bicicleta acumulan la gran parte de los kilometros recorridos

Horas dedicadas a cada tipo de actividad

In [10]:
# Define a time unit: "Year", "Quarter", "Month", "Week", or "Day"
time_unit_bar = "Month"

# Group by time_unit_bar and activity type
df_hr = df.groupby(by=[time_unit_bar, "Activity Type"], as_index=False).agg(
    count=("Moving Time (hr)", "count"),
    total_hr_spent=("Moving Time (hr)", "sum"),
    avg_hr_spent=("Moving Time (hr)", "mean"),
)

# For the plot tile
total_hr = round(df_hr["total_hr_spent"].sum())

# Plot a stacked bar plot
fig_hr = px.bar(
    df_hr,
    x=time_unit_bar,
    y="total_hr_spent",
    color="Activity Type",
    title=f"My {total_hr} hours on Strava!",  # Set title text
    hover_data={  # Define variables for hover text
        "count": ":f",
        "total_hr_spent": ":.1f",
        "avg_hr_spent": ":.1f",
    },
    labels=dict(  # Define labels for variables
        total_hr_spent="Total hrs spent",
        count="Number of activities",
        avg_hr_spent="Average hrs spent per activity",
    ),
    color_discrete_sequence=px.colors.qualitative.Bold,  # Define color swatch
)

# Set max allowed of ticks on x and y axes
fig_hr.update_xaxes(nticks=20)
fig_hr.update_yaxes(nticks=15)

# Adjust the size and layout
fig_hr.update_layout(
    autosize=False,
    width=700,
    height=500,
    template="plotly_white",  # Others options: "plotly", "plotly_dark", "ggplot2", "seaborn", "simple_white"
    legend=dict(  # Move the legend to the bottom
        orientation="h",
        yanchor="bottom",
        y=-0.6,
        xanchor="right",
        x=1,
        title=None,  # Remove legend title
    ),
    title={"y": 0.9, "x": 0.5, "xanchor": "center", "yanchor": "top"},  # Center title
)

fig_hr.show()
  • Se observa una evolución histórica desde más horas de bicicleta a más horas de correr
  • La estacionalidad muestra que los meses de verano hay más horas dedicadas a la bicicleta

Análisis de las actividades de carrera¶

In [14]:
# Find the activity with the most kms
most_kms = (
    df_km.groupby(by=["Activity Type"], as_index=False)
    .sum()
    .sort_values(by="total_distance_km")
)

# Define an activity here
activity = most_kms["Activity Type"].values[1]

# Generating four bins based on the Distance column
speed = df.loc[df["Activity Type"] == activity]
cats, bins = pd.cut(speed["Distance (km)"], 4, precision=0, retbins=True)
bins = np.around(bins, 0).astype(int)
bin_labels = []
for i in range(0, 4):
    bin_labels.append(
        f"{bins[i]}-{str(bins[i + 1])}km {activity.lower()}s"
    )
speed.insert(
    0, "distance_bin", pd.cut(speed["Distance (km)"], 4, precision=0, labels=bin_labels)
)

# Create a scatter plot with four subplots
fig_s = px.scatter(
    speed,
    x="Activity Date",
    y="Average Speed (km/hr)",
    facet_col="distance_bin",
    color="Distance (km)",
    trendline="ols",  # Add a black trend line
    trendline_color_override="black",
    color_continuous_scale="thermal",  # Define a color scale
    title="My average speed on " + activity.lower() + "s",  # Set title text
    category_orders={"distance_bin": bin_labels},  # Ascending order
    custom_data=["Activity Name", "Distance (km)", "Elevation Gain"],  # Variables for the hover text
)

# Customize the hover text
fig_s.update_traces(
    hovertemplate="Activity Name: %{customdata[0]}<br>"
    "Activity date: %{x|%Y-%m-%d}<br>"
    "Distance (km): %{customdata[1]:.1f}<br>"
    "Average speed (km/hr): %{y:.1f}<br>"
    "Elevation gain: %{customdata[2]:.1f}"
)

# Adjust the size and layout
fig_s.update_layout(
    autosize=False,
    width=700,
    height=600,
    template="seaborn",  # Others options: "plotly", "plotly_dark", "ggplot2", "seaborn", "simple_white"
)

# Hide subplot annotations and x-axis titles
fig_s.for_each_annotation(lambda a: a.update(text=a.text.split("=")[-1]))
for axis in fig_s.layout:
    if type(fig_s.layout[axis]) == go.layout.XAxis:
        fig_s.layout[axis].title.text = ""

# Rotate xtick labels
fig_s.update_xaxes(tickangle=-45)

# Set size of bin labels
fig_s.update_annotations(font=dict(size=10))

# Make the color bar smaller
fig_s.update_coloraxes(
    colorbar_thickness=15,
    colorbar_title_text="km",
    colorbar_title_font_size=12,
    colorbar_tickfont_size=10,
    colorbar_ticklen=3,
)

fig_s.show()
In [24]:
# Find the activity with the most kms
most_kms = (
    df_km.groupby(by=["Activity Type"], as_index=False)
    .sum()
    .sort_values(by="total_distance_km")
)

# Define an activity here
activity = most_kms["Activity Type"].values[1]

# Generating four bins based on the Distance column
speed = df.loc[df["Activity Type"] == activity]
cats, bins = pd.cut(speed["Distance (km)"], 4, precision=0, retbins=True)
bins = np.around(bins, 0).astype(int)
bin_labels = []
for i in range(0, 4):
    bin_labels.append(
        f"{bins[i]}-{str(bins[i + 1])}km {activity.lower()}s"
    )
speed.insert(
    0, "distance_bin", pd.cut(speed["Distance (km)"], 4, precision=0, labels=bin_labels)
)

# Create a scatter plot with four subplots
fig_s = px.scatter(
    speed,
    x="Distance (km)",
    y="Average Speed (km/hr)",
    color="Elevation Gain",
    trendline="ols",  # Add a black trend line
    trendline_color_override="black",
    color_continuous_scale="thermal",  # Define a color scale
    title="My average speed on " + activity.lower() + "s",  # Set title text
    custom_data=["Elevation Gain", "Activity Date"],  # Variables for the hover text
)

# Customize the hover text
fig_s.update_traces(
    hovertemplate=
    "Activity Date: %{customdata[1]|%Y-%m-%d}<br>"
    "Distance (km): %{x:.1f}<br>"
    "Average speed (km/hr): %{y:.1f}<br>"
    "Elevation gain: %{customdata[0]:.1f}"
)

# Adjust the size and layout
fig_s.update_layout(
    autosize=False,
    width=700,
    height=600,
    template="seaborn",  # Others options: "plotly", "plotly_dark", "ggplot2", "seaborn", "simple_white"
)


# Rotate xtick labels
fig_s.update_xaxes(tickangle=-45)

# Set size of bin labels
fig_s.update_annotations(font=dict(size=10))

# Make the color bar smaller
fig_s.update_coloraxes(
    colorbar_thickness=15,
    colorbar_title_text="Elevation gain",
    colorbar_title_font_size=12,
    colorbar_tickfont_size=10,
    colorbar_ticklen=3,
)

fig_s.show()
In [30]:
# Define a time unit: "Year", "Quarter", "Month", "Week", or "Day"
time_unit_bar = "Week"
df_carrera=df[(df['Activity Type']=='Carrera') & (df['Activity Date']>"2022-09-01 00:00:00")]
# Group by time_unit_bar and activity type
df_hr = df_carrera.groupby(by=[time_unit_bar, "Activity Type"], as_index=False).agg(
    count=("Moving Time (hr)", "count"),
    total_hr_spent=("Distance (km)", "sum"),
    avg_hr_spent=("Distance (km)", "mean"),
)

# For the plot tile
total_hr = round(df_hr["total_hr_spent"].sum())

# Plot a stacked bar plot
fig_hr = px.bar(
    df_hr,
    x=time_unit_bar,
    y="total_hr_spent",
    color="avg_hr_spent",
    title=f"My {total_hr} hours on Strava!",  # Set title text
    hover_data={  # Define variables for hover text
        "count": ":f",
        "total_hr_spent": ":.1f",
        "avg_hr_spent": ":.1f",
    },
    labels=dict(  # Define labels for variables
        total_hr_spent="Total hrs spent",
        count="Number of activities",
        avg_hr_spent="Average hrs spent per activity",
    ),
    color_discrete_sequence=px.colors.qualitative.Bold,  # Define color swatch
)

# Set max allowed of ticks on x and y axes
fig_hr.update_xaxes(nticks=20)
fig_hr.update_yaxes(nticks=15)

# Adjust the size and layout
fig_hr.update_layout(
    autosize=False,
    width=700,
    height=500,
    template="plotly_white",  # Others options: "plotly", "plotly_dark", "ggplot2", "seaborn", "simple_white"
    legend=dict(  # Move the legend to the bottom
        orientation="h",
        yanchor="bottom",
        y=-0.6,
        xanchor="right",
        x=1,
        title=None,  # Remove legend title
    ),
    title={"y": 0.9, "x": 0.5, "xanchor": "center", "yanchor": "top"},  # Center title
)

fig_hr.show()

Análisis de las actividades de bicicleta¶

In [31]:
# Find the activity with the most kms
most_kms = (
    df_km.groupby(by=["Activity Type"], as_index=False)
    .sum()
    .sort_values(by="total_distance_km")
)

# Define an activity here
activity = most_kms["Activity Type"].values[-1]

# Generating four bins based on the Distance column
speed = df.loc[df["Activity Type"] == activity]
cats, bins = pd.cut(speed["Distance (km)"], 4, precision=0, retbins=True)
bins = np.around(bins, 0).astype(int)
bin_labels = []
for i in range(0, 4):
    bin_labels.append(
        f"{bins[i]}-{str(bins[i + 1])}km {activity.lower()}s"
    )
speed.insert(
    0, "distance_bin", pd.cut(speed["Distance (km)"], 4, precision=0, labels=bin_labels)
)

# Create a scatter plot with four subplots
fig_s = px.scatter(
    speed,
    x="Activity Date",
    y="Average Speed (km/hr)",
    facet_col="distance_bin",
    color="Distance (km)",
    trendline="ols",  # Add a black trend line
    trendline_color_override="black",
    color_continuous_scale="thermal",  # Define a color scale
    title="My average speed on " + activity.lower() + "s",  # Set title text
    category_orders={"distance_bin": bin_labels},  # Ascending order
    custom_data=["Activity Name", "Distance (km)", "Elevation Gain"],  # Variables for the hover text
)

# Customize the hover text
fig_s.update_traces(
    hovertemplate="Activity Name: %{customdata[0]}<br>"
    "Activity date: %{x|%Y-%m-%d}<br>"
    "Distance (km): %{customdata[1]:.1f}<br>"
    "Average speed (km/hr): %{y:.1f}<br>"
    "Elevation gain: %{customdata[2]:.1f}"
)

# Adjust the size and layout
fig_s.update_layout(
    autosize=False,
    width=700,
    height=600,
    template="seaborn",  # Others options: "plotly", "plotly_dark", "ggplot2", "seaborn", "simple_white"
)

# Hide subplot annotations and x-axis titles
fig_s.for_each_annotation(lambda a: a.update(text=a.text.split("=")[-1]))
for axis in fig_s.layout:
    if type(fig_s.layout[axis]) == go.layout.XAxis:
        fig_s.layout[axis].title.text = ""

# Rotate xtick labels
fig_s.update_xaxes(tickangle=-45)

# Set size of bin labels
fig_s.update_annotations(font=dict(size=10))

# Make the color bar smaller
fig_s.update_coloraxes(
    colorbar_thickness=15,
    colorbar_title_text="km",
    colorbar_title_font_size=12,
    colorbar_tickfont_size=10,
    colorbar_ticklen=3,
)

fig_s.show()
In [35]:
# Find the activity with the most kms
most_kms = (
    df_km.groupby(by=["Activity Type"], as_index=False)
    .sum()
    .sort_values(by="total_distance_km")
)

# Define an activity here
activity = most_kms["Activity Type"].values[-1]

# Generating four bins based on the Distance column
speed = df.loc[df["Activity Type"] == activity]
cats, bins = pd.cut(speed["Distance (km)"], 4, precision=0, retbins=True)
bins = np.around(bins, 0).astype(int)
bin_labels = []
for i in range(0, 4):
    bin_labels.append(
        f"{bins[i]}-{str(bins[i + 1])}km {activity.lower()}s"
    )
speed.insert(
    0, "distance_bin", pd.cut(speed["Distance (km)"], 4, precision=0, labels=bin_labels)
)

# Create a scatter plot with four subplots
fig_s = px.scatter(
    speed,
    x="Elevation Gain",
    y="Average Speed (km/hr)",
    color="Distance (km)",
    trendline="ols",  # Add a black trend line
    trendline_color_override="black",
    color_continuous_scale="thermal",  # Define a color scale
    title="My average speed on " + activity.lower() + "s",  # Set title text
    custom_data=["Elevation Gain", "Activity Date"],  # Variables for the hover text
)

# Customize the hover text
fig_s.update_traces(
    hovertemplate=
    "Activity Date: %{customdata[1]|%Y-%m-%d}<br>"
    "Distance (km): %{x:.1f}<br>"
    "Average speed (km/hr): %{y:.1f}<br>"
    "Elevation gain: %{customdata[0]:.1f}"
)

# Adjust the size and layout
fig_s.update_layout(
    autosize=False,
    width=700,
    height=600,
    template="seaborn",  # Others options: "plotly", "plotly_dark", "ggplot2", "seaborn", "simple_white"
)


# Rotate xtick labels
fig_s.update_xaxes(tickangle=-45)

# Set size of bin labels
fig_s.update_annotations(font=dict(size=10))

# Make the color bar smaller
fig_s.update_coloraxes(
    colorbar_thickness=15,
    colorbar_title_text="Elevation gain",
    colorbar_title_font_size=12,
    colorbar_tickfont_size=10,
    colorbar_ticklen=3,
)

fig_s.show()
In [ ]: